Importing libraries

In [116]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import random
import math
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator

Loading the dataset

In [2]:
df = pd.read_csv("time_series_covid19_confirmed_global.csv")
df.head()
Out[2]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 7/4/20 7/5/20 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20 7/11/20 7/12/20 7/13/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 32672 32951 33190 33384 33594 33908 34194 34366 34451 34455
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 2819 2893 2964 3038 3106 3188 3278 3371 3454 3571
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 15500 15941 16404 16879 17348 17808 18242 18712 19195 19689
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 855 855 855 855 855 855 855 855 855 858
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 346 346 346 386 386 396 458 462 506 525

5 rows × 178 columns

In [3]:
df1 = df.copy()
df1.drop(["Province/State","Lat","Long"],inplace=True,axis=1)
In [4]:
df1 = df1.melt(id_vars=["Country/Region"], var_name="Date", value_name="Value")
In [5]:
fig = px.line(df1, x="Date", y="Value", title='Change in Confirmed cases with time',color='Country/Region')
fig.show()

Top 5 Countries with Highest Cases

In [162]:
df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20',ascending=False)[:5]
Out[162]:
Country/Region 7/13/20
174 US 3364157
23 Brazil 1884967
79 India 906752
140 Russia 732547
134 Peru 330123
In [164]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Brazil']['Date'], y=df1[df1["Country/Region"] == 'Brazil']['Value'],
                    mode='lines',
                    name='Brazil'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Russia']['Date'], y=df1[df1["Country/Region"] == 'Russia']['Value'],
                    mode='lines',
                    name='Russia'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Peru']['Date'], y=df1[df1["Country/Region"] == 'Peru']['Value'],
                    mode='lines',
                    name='Peru'))
fig.update_layout(
    title = "Time Series Analysis of (Date and Confirmed Cases) for Countries with Highest Cases",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Confirmed Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

Top 5 Countries with lowest Number of Confirmed cases`

In [165]:
df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20')[:5]
Out[165]:
Country/Region 7/13/20
104 MS Zaandam 9
184 Western Sahara 10
132 Papua New Guinea 11
75 Holy See 12
142 Saint Kitts and Nevis 17
In [166]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Western Sahara']['Date'], y=df1[df1["Country/Region"] == 'Western Sahara']['Value'],
                    mode='lines',
                    name='Western Sahara'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Papua New Guinea']['Date'], y=df1[df1["Country/Region"] == 'Papua New Guinea']['Value'],
                    mode='lines',
                    name='Papua New Guinea'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'MS Zaandam']['Date'], y=df1[df1["Country/Region"] == 'MS Zaandam']['Value'],
                    mode='lines',
                    name='MS Zaandam'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Holy See']['Date'], y=df1[df1["Country/Region"] == 'Holy See']['Value'],
                    mode='lines',
                    name='Holy See'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Saint Kitts and Nevis']['Date'], y=df1[df1["Country/Region"] == 'Saint Kitts and Nevis']['Value'],
                    mode='lines',
                    name='Saint Kitts and Nevis'))
fig.update_layout(
    title = "Time Series Analysis of (Date and Confirmed Cases) for Countries with Lowest Cases",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Confirmed Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Log of Confirmed Cases Over time

In [10]:
fig = px.line(df1, x="Date", y="Value", title='Log of cases over time for all the countries',color='Country/Region')
fig.update_layout(yaxis_type="log",
                 yaxis = dict(title_text = "log(Confirmed Cases)"))
fig.show()
In [167]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Brazil']['Date'], y=df1[df1["Country/Region"] == 'Brazil']['Value'],
                    mode='lines',
                    name='Brazil'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Russia']['Date'], y=df1[df1["Country/Region"] == 'Russia']['Value'],
                    mode='lines',
                    name='Russia'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Peru']['Date'], y=df1[df1["Country/Region"] == 'Peru']['Value'],
                    mode='lines',
                    name='Peru'))
fig.update_layout(
    title = "Log of cases over time for top 5 countries",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Confirmed Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Country Specific Graphs

Distribution of Confirmed cases in U.S.

In [108]:
f,ax = plt.subplots(figsize=(15,6))
sns.distplot(df1[df1["Country/Region"] == "US"]["Value"])
plt.xlabel("Confirmed Cases")
plt.ylabel("Days")
plt.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Confirmed cases in U.S.

In [106]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.update_layout(
    title = "Confirmed Cases in U.S.",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Confirmed Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Log of Confirmed Cases in U.S.

In [107]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))

fig.update_layout(
    title = "Log of Confirmed Cases in U.S.",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Confirmed Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Confirmed Cases in India

In [109]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))
fig.update_layout(
    title = "Confirmed Cases in India",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Confirmed Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

Log of Cases Confirmed in India

In [110]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines',
                    name='India'))

fig.update_layout(
    title = "Log of Confirmed Cases in India",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Confirmed Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Confirmed Cases in Spain

In [114]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Spain']['Date'], y=df1[df1["Country/Region"] == 'Spain']['Value'],
                    mode='lines',
                    name='Spain'))
fig.update_layout(
    title = "Confirmed Cases in Spain",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Confirmed Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

Log of Confirmed Cases in Spain

In [115]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Spain']['Date'], y=df1[df1["Country/Region"] == 'Spain']['Value'],
                    mode='lines',
                    name='China'))

fig.update_layout(
    title = "Log of Confirmed Cases in Spain",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Confirmed Cases)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Confirmed Cases all over the world

In [173]:
country_tot = df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20',ascending=False)[:20]
In [175]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y=country_tot["Country/Region"],
    x= country_tot["7/13/20"],
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=2)
    )
))
fig.update_layout(
    title = "Confirmed Cases all over the world",
    xaxis = dict(
        title_text = "Cases",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Country",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

World Daily increase in Confirmed cases

In [88]:
world_daily = df1.groupby("Date").sum().reset_index()
In [177]:
fig = go.Figure(go.Bar(
            x=world_daily["Date"],
            y=world_daily["Value"],
            orientation='v'))
fig.update_layout(
    title = "World Daily rise in Confirmed Cases",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Cases",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

Prediction Part

In [27]:
df.head()
Out[27]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 7/4/20 7/5/20 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20 7/11/20 7/12/20 7/13/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 32672 32951 33190 33384 33594 33908 34194 34366 34451 34455
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 2819 2893 2964 3038 3106 3188 3278 3371 3454 3571
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 15500 15941 16404 16879 17348 17808 18242 18712 19195 19689
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 855 855 855 855 855 855 855 855 855 858
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 346 346 346 386 386 396 458 462 506 525

5 rows × 178 columns

In [14]:
columns = df.keys()
confirmed = df.loc[:, columns[4]:columns[-1]]
In [16]:
dates = confirmed.keys()
world_cases = []

for i in dates:
    confirmed_sum = confirmed[i].sum()
    world_cases.append(confirmed_sum)
In [17]:
days_since_1_22 = np.array([i for i in range(len(dates))]).reshape(-1, 1)
world_cases = np.array(world_cases).reshape(-1, 1)
In [18]:
days_in_future = 15
future_forcast = np.array([i for i in range(len(dates)+days_in_future)]).reshape(-1, 1)
adjusted_dates = future_forcast[:-15]
In [19]:
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
future_forcast_dates = []
for i in range(len(future_forcast)):
    future_forcast_dates.append((start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))
In [20]:
X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(days_since_1_22, world_cases, test_size=0.15, shuffle=False) 

Prediction using Linear Regression

In [21]:
linear_model = LinearRegression(normalize=True, fit_intercept=True)
linear_model.fit(X_train_confirmed, y_train_confirmed)
test_linear_pred = linear_model.predict(X_test_confirmed)
linear_pred = linear_model.predict(future_forcast)
print('MAE:', mean_absolute_error(test_linear_pred, y_test_confirmed))
print('MSE:',mean_squared_error(test_linear_pred, y_test_confirmed))
MAE: 3499716.1981492247
MSE: 13277206245372.941
In [22]:
print(linear_model.coef_)
print(linear_model.intercept_)
[[55153.42133299]]
[-1755182.31513146]
In [25]:
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
plt.plot(future_forcast, linear_pred, linestyle='dashed', color='orange')
plt.title('Number of Covid Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Number of Cases', size=30)
plt.legend(['Confirmed Cases', 'Linear Regression Predictions'])
plt.xticks(size=15)
plt.show()

Future predictions using Linear Regression

In [26]:
print('Linear regression future predictions:')
print(linear_pred[-15:])
Linear regression future predictions:
[[7841512.99680894]
 [7896666.41814193]
 [7951819.83947492]
 [8006973.26080791]
 [8062126.68214091]
 [8117280.1034739 ]
 [8172433.52480689]
 [8227586.94613988]
 [8282740.36747287]
 [8337893.78880586]
 [8393047.21013885]
 [8448200.63147184]
 [8503354.05280483]
 [8558507.47413782]
 [8613660.89547081]]

Prediction using Support Vector Machines

In [93]:
svm_confirmed = SVR(shrinking=True, kernel='poly',gamma=0.01, epsilon=1,degree=3, C=0.1)
svm_confirmed.fit(X_train_confirmed, y_train_confirmed)
svm_pred = svm_confirmed.predict(future_forcast)
C:\Users\Saurabh\Anaconda3\lib\site-packages\sklearn\utils\validation.py:752: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

In [101]:
# check against testing data
svm_test_pred = svm_confirmed.predict(X_test_confirmed)
plt.figure(figsize=(15,6))
plt.plot(y_test_confirmed)
plt.plot(svm_test_pred)
plt.legend(['Test Data', 'SVM Predictions'])
print('MAE:', mean_absolute_error(svm_test_pred, y_test_confirmed))
print('MSE:',mean_squared_error(svm_test_pred, y_test_confirmed))
MAE: 805888.8250874875
MSE: 693174902660.4805
In [178]:
x = adjusted_dates
y = world_cases
pred = svm_pred
algo_name = 'SVM Predictions'
color = 'purple'
plt.figure(figsize=(15, 8))
plt.plot(x, y)
plt.plot(future_forcast, pred, linestyle='dashed', color=color)
plt.title('Worldwide Coronavirus Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Number of Cases', size=30)
plt.legend(['Confirmed Cases', algo_name], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()
In [ ]: